# Load necessary libraries
library(xgboost)
library(caret)
library(Metrics)  # assumed source of the rmse() helper used later
train <- readRDS("train.rds")
test <- readRDS("test.rds")
# Inspect the structure of the data
str(train)
str(test)
# Split the training data train:test -> 80%:20%
n_train <- floor(0.8 * nrow(train))  # floor() guards against non-integer indices
train_train <- train[1:n_train, ]
test_train <- train[(n_train + 1):nrow(train), ]
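# Note: the sequential split above assumes the rows of train are already in
# random order. If they might be ordered, a randomized split is safer; a
# minimal sketch using caret's createDataPartition (not part of the original run):
# set.seed(123)
# idx <- createDataPartition(train[, 230], p = 0.8, list = FALSE)
# train_train <- train[idx, ]
# test_train <- train[-idx, ]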
# Use caret to find the best hyperparameters using 5-fold CV
#set up grid to choose the best values for the following parameters
control <- trainControl(method = "cv", number = 5)
xgb_grid <- expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
xgb_caret <- train(x = train_train[, -230], y = train_train[, 230],
                   method = "xgbTree", trControl = control, tuneGrid = xgb_grid)
xgb_caret$bestTune
# From this run, the best tuning parameters are max_depth = 4, eta = 0.1 and min_child_weight = 4
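# To see how the other grid combinations performed, the full resampling table
# can be inspected; xgb_caret$results is caret's standard results data frame,
# sorted here by cross-validated RMSE:
head(xgb_caret$results[order(xgb_caret$results$RMSE), ])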
train_labels <- train_train[, 230]
# Put the training and hold-out data into two separate xgb.DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train_train[, -230]), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_train[, -230]))
# Use the best parameters from the tuning run
parameters <- list(
  objective = "reg:linear",   # deprecated in recent xgboost; "reg:squarederror" is the modern equivalent
  booster = "gbtree",
  eta = 0.1,                  # default = 0.3
  gamma = 0,
  max_depth = 4,              # default = 6
  min_child_weight = 4,       # default = 1
  subsample = 1,
  colsample_bytree = 1
)
# Cross-validation using the built-in xgb.cv() to find the best number of rounds
set.seed(123)
xgbcv <- xgb.cv(params = parameters, data = dtrain, nrounds = 10000, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 40,
                early_stopping_rounds = 10, maximize = FALSE)
[1] train-rmse:11.413846+0.007614 test-rmse:11.413752+0.030885
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.
[41] train-rmse:7.644597+0.005097 test-rmse:7.644520+0.032602
[81] train-rmse:5.122702+0.003400 test-rmse:5.123048+0.029993
[121] train-rmse:3.435296+0.002239 test-rmse:3.435759+0.026427
[161] train-rmse:2.306322+0.001437 test-rmse:2.307605+0.022512
[201] train-rmse:1.550925+0.000961 test-rmse:1.552953+0.018722
[241] train-rmse:1.045665+0.000752 test-rmse:1.048705+0.015871
[281] train-rmse:0.708045+0.000700 test-rmse:0.713113+0.013718
[321] train-rmse:0.482904+0.000802 test-rmse:0.491152+0.012104
[361] train-rmse:0.333550+0.000965 test-rmse:0.345948+0.010399
[401] train-rmse:0.235578+0.001194 test-rmse:0.253653+0.009072
[441] train-rmse:0.172388+0.001377 test-rmse:0.197499+0.008986
[481] train-rmse:0.132697+0.001564 test-rmse:0.165074+0.010620
[521] train-rmse:0.108632+0.001682 test-rmse:0.147400+0.012964
[561] train-rmse:0.094332+0.001771 test-rmse:0.138112+0.015068
[601] train-rmse:0.085758+0.001864 test-rmse:0.133278+0.016492
[641] train-rmse:0.080401+0.001925 test-rmse:0.130593+0.017600
[681] train-rmse:0.076742+0.001888 test-rmse:0.129021+0.018262
[721] train-rmse:0.074035+0.001825 test-rmse:0.127961+0.018783
[761] train-rmse:0.071941+0.001759 test-rmse:0.127210+0.019097
[801] train-rmse:0.070175+0.001732 test-rmse:0.126631+0.019305
[841] train-rmse:0.068606+0.001697 test-rmse:0.126181+0.019362
[881] train-rmse:0.067198+0.001641 test-rmse:0.125828+0.019394
[921] train-rmse:0.065869+0.001604 test-rmse:0.125473+0.019420
[961] train-rmse:0.064626+0.001572 test-rmse:0.125227+0.019502
[1001] train-rmse:0.063460+0.001568 test-rmse:0.124946+0.019540
[1041] train-rmse:0.062388+0.001617 test-rmse:0.124762+0.019490
[1081] train-rmse:0.061345+0.001632 test-rmse:0.124658+0.019473
[1121] train-rmse:0.060370+0.001645 test-rmse:0.124539+0.019446
[1161] train-rmse:0.059343+0.001610 test-rmse:0.124447+0.019409
[1201] train-rmse:0.058397+0.001612 test-rmse:0.124355+0.019398
[1241] train-rmse:0.057484+0.001613 test-rmse:0.124269+0.019407
[1281] train-rmse:0.056640+0.001594 test-rmse:0.124162+0.019396
[1321] train-rmse:0.055808+0.001611 test-rmse:0.124112+0.019350
Stopping. Best iteration:
[1313] train-rmse:0.055956+0.001618 test-rmse:0.124112+0.019363
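# The optimal round count can be read straight off the returned object rather
# than transcribed by hand; best_iteration is a standard field of the object
# xgb.cv() returns when early stopping is used:
xgbcv$best_iteration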
# Train the final model with the best-tuned parameters and nrounds = 1032
# (ideally this is taken directly from xgbcv$best_iteration, as above)
xgb_mod <- xgb.train(data = dtrain, params = parameters, nrounds = 1032)
XGBpred <- predict(xgb_mod, dtest)
head(XGBpred)
[1] 12.13067 13.05577 11.84750 12.02057 11.98641 12.43907
predictions_XGB <- XGBpred  # predictions are on the log scale; exponentiated below to recover the real values
# Evaluate the results (correlation and RMSE, both on the log scale)
cor(predictions_XGB,test_train[,230])
[1] 0.9417143
rmse(test_train[,230],predictions_XGB)
[1] 0.1320425
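# Since the labels are on the log scale, the error can also be reported in the
# original units by exponentiating both vectors first (a sketch, not part of
# the original evaluation):
rmse(exp(test_train[, 230]), exp(predictions_XGB))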
# Visualize the results
plot(exp(predictions_XGB), exp(test_train[, 230]),
     xlab = "Predicted Label", ylab = "Actual Label",
     main = "Plot of Actual Against Predicted Labels")
lin.mod <- lm(exp(test_train[, 230]) ~ exp(predictions_XGB))
pr.lm <- predict(lin.mod)
lines(pr.lm ~ exp(predictions_XGB), col = "blue", lwd = 0.5)
lines(c(0, 450000), c(0, 450000))
legend("topleft", legend = c("fitted line", "45 degree line"),
       col = c("blue", "black"), lty = 1, cex = 0.8)
# View the variable importance plot
# install.packages("Ckmeans.1d.dp")
library(Ckmeans.1d.dp)  # required by xgb.ggplot.importance to cluster the bars
mat <- xgb.importance(feature_names = colnames(train_train[, -230]), model = xgb_mod)
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)
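# The importance matrix itself is a data.table with Gain, Cover and Frequency
# columns, so the top features can also be inspected directly as a table:
head(mat)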